User Inputs

# Copy the report parameters into top-level variables used by later chunks.
output.var <- params$output.var
# NOTE(review): the params listing printed for this run has no transform.abs
# entry, so this presumably evaluates to NULL here - confirm it is still needed.
transform.abs <- params$transform.abs
log.pred <- params$log.pred
# Echo the full parameter list so the rendered report records the run settings.
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 2
##  $ output.var: chr "y3"
##  $ log.pred  : logi TRUE
# Name of the column that is actually modelled (output.var.tr):
#   - predicting on the log scale    -> "<output.var>.log"
#   - predicting on the normal scale -> output.var itself
if (log.pred == TRUE) {
  output.var.tr <- paste0(output.var, '.log')
} else if (log.pred == FALSE) {
  output.var.tr <- output.var
}

Loading Data

# Read the feature table and the label table from disk.
feat <- read.csv('../../Data/features_highprec.csv')
labels <- read.csv('../../Data/labels.csv')
# Every feature column except the JobName key is treated as a predictor.
predictors <- feat %>%
  dplyr::select(-JobName) %>%
  names()
# Keep only the jobs that appear in both tables.
data.ori <- feat %>% inner_join(labels, by = 'JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')

Data validation

# Flag rows with no missing value in any joined column (features and labels).
cc <- complete.cases(data.ori)
# Rows with at least one missing value are set aside for reference.
data.notComplete <- data.ori[!cc, ]
# The working data keeps complete rows and only the predictors, the selected
# output variable and the JobName key.
data <- select_at(data.ori[cc, ], c(predictors, output.var, 'JobName'))
message('Original cases: ', nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ', nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ', nrow(data))
## Complete cases: 6980

Output Variable

The output variable y3 is right-skewed, so we will proceed with a log transformation.

Histogram

# Density-scaled histogram of the raw output variable with a density overlay.
data %>%
  select_at(output.var) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

QQPlot

# Normal Q-Q plot of the raw output variable.
data %>%
  select_at(output.var) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)

Best Normalizator y3

Normalization of y3 using the bestNormalize package (which suggests orderNorm). This is interesting, but I think it goes beyond the objective of the project.

# Let bestNormalize compare its candidate transformations on the raw output
# variable, then print the selection summary.
t <- bestNormalize::bestNormalize(data[[output.var]])
print(t)
## Best Normalizing transformation with 6980 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - No transform: 3.0147 
##  - Box-Cox: 1.4848 
##  - Log_b(x+a): 2.0717 
##  - sqrt(x+a): 2.492 
##  - exp(x): 748.8387 
##  - arcsinh(x): 2.0717 
##  - Yeo-Johnson: 1.2598 
##  - orderNorm: 1.2159 
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6980 nonmissing obs and no ties 
##  - Original quantiles:
##      0%     25%     50%     75%    100% 
##  95.913 118.289 124.030 131.059 193.726
# Normal Q-Q plot of the raw output variable ...
qqnorm(data[[output.var]])

# ... and of the values returned by predict() on the bestNormalize fit `t`.
qqnorm(predict(t))

orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution

Transformation of Output Variable from y3 to y3.log

# Add the modelled column: the base-10 log of the output when predicting on the
# log scale, otherwise an unchanged copy of the output.
if (log.pred == TRUE) {
  data[[output.var.tr]] <- log(data[[output.var]], base = 10)
} else {
  data[[output.var.tr]] <- data[[output.var]]
}

# Histograms of the raw and the modelled output, one facet per column.
data %>%
  select_at(c(output.var, output.var.tr)) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

Predictors

All predictors show a fat-tail situation: the two tails are very tall and the density around the mean is low. The orderNorm transformation can help (see [Best Normalizator] section).

Interesting Predictors

Histogram and QQ plot

# Hand-picked predictors to inspect more closely.
cols <- c('x11', 'x18', 'stat98', 'x7')

# Density-scaled histograms, one facet per selected predictor.
data %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 2)

# Normal Q-Q plots for the same predictors.
data %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 2)

# Summary statistics for each selected predictor.
data %>%
  select_at(cols) %>%
  lapply(summary)
## $x11
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07 
## 
## $x18
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.500   3.147   4.769   4.772   6.418   7.999 
## 
## $stat98
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.998619 -1.551882 -0.015993 -0.005946  1.528405  2.999499 
## 
## $x7
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.700   1.266   1.854   1.852   2.446   3.000

Scatter plot vs. output variable y3.log

# Long format: one row per (predictor, value) pair, with the modelled output
# column kept alongside so it can be used as the y axis.
d <- data %>%
  dplyr::select_at(c(cols, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)
# Scatter plus smoother of the modelled output against each selected predictor.
ggplot(d, aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light green', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Histograms

All indicators have a strong indication of Fat-Tails

# Density-scaled histogram with density overlay for every predictor.
data %>%
  select_at(predictors) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

QQPlots

# Normal Q-Q plot for every predictor.
data %>%
  select_at(predictors) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)

Correlations

With Output Variable

#chart.Correlation(select(data,-JobName),  pch=21)
# Correlation of every column other than JobName and the modelled output with
# the modelled output, rounded to 4 decimals and shown as an interactive table.
t <- cor(
  dplyr::select(data, -one_of(output.var.tr, 'JobName')),
  select_at(data, output.var.tr)
) %>%
  round(4)
DT::datatable(t)

All Variables

#chart.Correlation(select(data,-JobName),  pch=21)
# Pairwise correlation matrix of every column except the JobName key, rounded
# to 4 decimals.
t <- round(cor(dplyr::select(data, -one_of('JobName'))), 4)
# Horizontal scrolling is enabled for the table. The option is spelled TRUE
# rather than T because T is an ordinary variable that can be rebound, whereas
# TRUE is a reserved constant.
DT::datatable(t, options = list(scrollX = TRUE))

Scatter Plots with Output Variable

Scatter plots with all predictors and the output variable (y3.log)

# Long format: one row per (predictor, value) pair, with the modelled output
# column kept alongside so it can be used as the y axis.
d <- data %>%
  dplyr::select_at(c(predictors, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)
# Scatter plus smoother of the modelled output against every predictor.
ggplot(d, aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light blue', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Multicollinearity - VIF

No Multicollinearity among predictors

Showing Top predictor by VIF Value

# Variance inflation factor of each predictor, largest first.
vifDF <- arrange(usdm::vif(select_at(data, predictors)), desc(VIF))
# Show the ten predictors with the highest VIF.
head(vifDF, n = 10)
##    Variables      VIF
## 1    stat100 1.060877
## 2    stat209 1.060553
## 3     stat20 1.060198
## 4    stat178 1.060048
## 5    stat141 1.059520
## 6    stat207 1.057697
## 7     stat87 1.057394
## 8    stat154 1.057268
## 9    stat135 1.057038
## 10   stat104 1.057036

Feature Eng

  • No transformation for x18

  • log transformation for y3

# Working copy of the data with a candidate square-root version of x18.
data.tr <- mutate(data, x18.sqrt = sqrt(x18))
# Columns compared in the pre/post transformation plots.
cols <- c('x18', 'x18.sqrt')

Comparing Pre and Post Transformation Density Plots

# Density-scaled histograms of x18 before and after the square-root transform.
data.tr %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

# Long format of the compared columns, keeping the modelled output column
# alongside so it can be used as the y axis.
d <- data.tr %>%
  dplyr::select_at(c(cols, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)
# Scatter plus smoother of the modelled output against each compared column.
ggplot(d, aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light blue', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Remove unwanted variables: the exploratory square-root feature and the raw
# 'y3' column. The feature is created as 'x18.sqrt' (with a dot); the previous
# exclusion list spelled it 'x18sqrt', which matched no column, so the feature
# was silently kept.
# NOTE(review): 'y3' is hard-coded. When log.pred is FALSE, output.var.tr is
# the raw output name, so for output.var == 'y3' this drops the modelled
# column itself - confirm that is intended.
data.tr <- data.tr %>%
  dplyr::select_at(names(data.tr)[!names(data.tr) %in% c('x18.sqrt', 'y3')])

Conclusion

  • the target variable y3 can be LOG transformed

  • the predictor x18 does not improve with a SQRT transformation

  • all predictors could benefit from an orderNorm transformation